s3_220131_cluster_optimization

nlc

01/31/2022

Experiment & contact info

PIs: Rosalie Sears (), Laura Heiser ()

Sample preparation: Zinab Doha ()

Library prep from single cells: Xi Li & Colin Daniel ()

Analysis: Nick Calistri ()

Sequencing performed by OHSU MPSSR

Analysis design

  • load rliger integrated data (s2_vehicle_rliger.rds)

  • Optimize leiden clustering for maximized silhouette width

  • Visualize cluster versus lineage markers

  • Manually assign lineage

  • Run 3D UMAP and visualize clusters + lineage in 3d

  • Find DEGs (upregulated) for each cluster compared to all other clusters

  • Save updated so_merge in updated .rds file

Set up

Load libraries

library(Matrix)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x tidyr::pack()   masks Matrix::pack()
## x tidyr::unpack() masks Matrix::unpack()
library(Seurat)
## Warning: package 'Seurat' was built under R version 4.1.2
## Registered S3 method overwritten by 'spatstat.geom':
##   method     from
##   print.boxx cli
## Attaching SeuratObject
library(cluster)

load s2_vehicle_integrated.rds and set seed

so_merge <- readRDS('analysis_files/s2_vehicle_integrated.rds')

so_merge@meta.data$cell_barcode <- rownames(so_merge@meta.data)

set.seed(3)

Prepare silhoutte scoring function

library(bluster)

cluster_sweep <- function(resolution = seq(from = 0.3, to = 0.8, by = 0.1), seurat_object, embedding, ndims = ncol(emedding)){
  
  
  for(i in resolution){
    seurat_object <- FindClusters(seurat_object,
                                resolution = i,
                                algorithm = 4)
  
    p1 <- DimPlot(seurat_object,
                  group.by = 'seurat_clusters',
                  label = TRUE)+
      coord_equal()
    
    print(p1)
    
    curr_clusters <- seurat_object@meta.data$seurat_clusters 
    
    sil <- approxSilhouette(x = embedding,
                            clusters = curr_clusters)
    
    boxplot(split(sil$width, curr_clusters),
            main = paste0('Resolution: ', i, '\n Mean sil.width: ', mean(sil$width)))
    
    best.choice <- ifelse(sil$width > 0,
                          curr_clusters,
                          sil$other)
    
    table(Assigned=curr_clusters, Closest=best.choice)
    
    # 
    rmsd <- clusterRMSD(embedding, curr_clusters)
    barplot(rmsd,
            main = paste0('Resolution: ', i, '\n Mean rmsd: ', mean(rmsd)))
    
    
    if(i == resolution[1]){
      qc <- tibble(res = i,
                   n_clusters = length(unique(sil$cluster)),
                   mean_sil_width = mean(sil$width),
                   mean_rmsd = mean(rmsd))
    }else{
      qc <- rbind(qc,
                  tibble(res = i,
                  n_clusters = length(unique(sil$cluster)),
                  mean_sil_width = mean(sil$width),
                  mean_rmsd = mean(rmsd)))
    }
    
  }
  return(qc)
}

Cluster optimization (by silhouette width)

reduction_to_use <- 'iNMF'
ndims <- 50

###

ElbowPlot(so_merge, ndims = 50, reduction = 'harmony')

so_merge <- FindNeighbors(so_merge,
                          reduction = reduction_to_use,
                          dims = 1:ndims)
## Computing nearest neighbor graph
## Computing SNN
so_merge <- RunUMAP(so_merge,
                    reduction = reduction_to_use,
                    dims = 1:ndims)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 09:25:38 UMAP embedding parameters a = 0.9922 b = 1.112
## 09:25:38 Read 14042 rows and found 50 numeric columns
## 09:25:38 Using Annoy for neighbor search, n_neighbors = 30
## 09:25:38 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 09:25:41 Writing NN index file to temp file C:\Users\Nick\AppData\Local\Temp\RtmpwBA9Fz\file183026a1557a
## 09:25:41 Searching Annoy index using 1 thread, search_k = 3000
## 09:25:45 Annoy recall = 100%
## 09:25:45 Commencing smooth kNN distance calibration using 1 thread
## 09:25:45 Initializing from normalized Laplacian + noise
## 09:25:46 Commencing optimization for 200 epochs, with 635162 positive edges
## 09:25:58 Optimization finished
cluster_qc <- cluster_sweep(seurat_object = so_merge,
                            embedding = so_merge@reductions$iNMF@cell.embeddings,
                            resolution = seq(from = 0.1, to = 1.5, by = 0.05))

cluster_qc <- cluster_qc %>%
  gather(- c(res), value = 'value', key = 'metric')

ggplot(cluster_qc, aes(x = res, y = value, color = metric))+
  geom_point()+
  geom_line()+
  theme_bw()+
  facet_wrap(~metric, ncol = 3, scales = 'free_y')+
  scale_x_continuous(breaks = seq(from = 0, to = 1.5, by = 0.25))+
  xlab('Leiden resolution')+
  geom_vline(xintercept = 0.45, linetype = 'dashed')

Supplementary figure for clustering optimization

# 1080 x 240
ggplot(cluster_qc, aes(x = res, y = value, color = metric))+
  geom_point()+
  geom_line()+
  theme_bw()+
  facet_wrap(~metric, ncol = 3, scales = 'free_y')+
  scale_x_continuous(breaks = seq(from = 0, to = 1.5, by = 0.25))+
  xlab('Leiden resolution')+
  geom_vline(xintercept = 0.45, linetype = 'dashed')

Cluster based on optimal resolution

so_merge <- FindClusters(so_merge,
                         algorithm = 4,
                         resolution = 0.45)

DimPlot(so_merge, label = TRUE, label.size = 8)+
  coord_equal()+
  theme(legend.position = 'none')

ggplot(so_merge@meta.data, aes(x = seurat_clusters, fill = tumor))+
  geom_bar()+
  theme_bw()+
  facet_wrap(~tumor)

ggplot(so_merge@meta.data, aes(x = seurat_clusters, fill = histology))+
  geom_bar()+
  theme_bw()+
  facet_wrap(~histology, ncol = 1)

Visualize lineage markers by cluster

DotPlot(so_merge,
        features = c('Ptprc', 'Cd74', 'Epcam', 'Cd14', 'Cd3e', 'Pdgfra', 'Krt5', 'Pecam1', 'Mki67', 'Pdgfrb'),
        cols = 'RdYlBu')+
  theme_bw()+
  coord_flip()

Cluster 14 lineage assignment

Subset cluster 14

c14 <- subset(so_merge,
              subset = seurat_clusters == 14)

Harmony is used because the underlying PCA will separate celltype in sparser latent space than iNMF

Harmony + UMAP

library(harmony)
## Loading required package: Rcpp
c14 <- FindVariableFeatures(c14)
c14 <- NormalizeData(c14)
c14 <- ScaleData(c14)
## Centering and scaling data matrix
c14 <- RunPCA(c14)
## PC_ 1 
## Positive:  Tmsb10, Trbc2, Sept1, Cd3g, Ablim1, Cd3d, Ptprcap, Trac, Skap1, S100a10 
##     Il2rb, Ctsw, Ctla2a, Pdcd4, Ltb, Lck, S100a6, Rora, Thy1, Nkg7 
##     Cd226, Cxcr6, Dut, Sh2d2a, Trbc1, Ikzf3, Il18r1, Itgae, Inpp4b, Ahnak 
## Negative:  C1qc, Ctss, C1qa, C1qb, Mpeg1, Cx3cr1, Apoe, Csf1r, Aif1, Lgmn 
##     Cd74, Ctsc, Ms4a7, Cxcl16, Cst3, Ctsz, Hexb, Gatm, Cybb, Tyrobp 
##     Tgfbi, Lyz2, Cd68, Ctsb, Pld4, Psap, Ftl1, H2-Ab1, Fcer1g, Man2b1 
## PC_ 2 
## Positive:  Lcn2, Wfdc18, Csn3, Epcam, Nfib, Fxyd3, Trf, Cldn7, Cldn3, Igfbp5 
##     Cald1, Krt8, Dbi, Krt18, Aqp5, Sox9, Plet1, Sox4, Phlda1, Nfix 
##     Nedd4, Scgb2b27, Auts2, Apoc1, Tpm1, Cd9, Fermt2, Chil1, Ptprf, Cd24a 
## Negative:  AW112010, Srgn, Ltb, Crip1, Ptprcap, Ctsw, Vim, Cd3g, Ablim1, Sept1 
##     Il2rb, Nkg7, S100a4, Trbc2, Gimap4, Skap1, Ifngr1, Rgs1, Itgae, S100a10 
##     Cd226, Inpp4b, Bcl2, Lck, Esyt1, H2-Q7, Ptpn22, Cd3d, Atad2, Gzmb 
## PC_ 3 
## Positive:  AY036118, Igkc, Maf, Il7r, Gm26917, Ccnb1ip1, Icos, Apoe, Cxcr6, Cd6 
##     Rora, Lst1, Cd74, Il2ra, Ms4a4b, Rmrp, Lgals1, Hbb-bs, Trac, Hba-a1 
##     Ly6a, Ccr2, Cd3d, Cd4, Il18r1, Hbb-bt, Ttc39c, H2-Aa, Ifi27l2a, Thy1 
## Negative:  Cemip2, Gzmb, Car2, Xcl1, Hsp90ab1, Rps2, Klra5, Ier5l, Ctla2b, Junb 
##     Rps18, Cdv3, Npm1, Hspd1, Fasl, Nme2, Nme1, Eif5a, Errfi1, Rpl12 
##     Sult2b1, Cebpb, Eef1b2, Nr4a2, Rps3, Eef1g, Klrk1, Itih5, Sfr1, Cited4 
## PC_ 4 
## Positive:  Lgals1, Cavin3, Maf, S100a4, S100a10, Rhobtb3, Tmem64, Serpinb1a, Palld, Cav1 
##     Trdc, Cd163l1, Il7r, Rora, Ptpn14, Col6a1, Ramp1, Capg, Ppic, Phlda3 
##     Fat1, Igfbp3, Trdv4, Cx3cl1, Tpm2, Timp1, Krt15, Cd3d, Fkbp9, Ikzf2 
## Negative:  Xcl1, Scgb2b27, Klrd1, Nr4a2, Scgb1b27, Cemip2, Gzmb, Ncr1, Serpinb9, Klra5 
##     Car6, Ctla2a, Etv1, Wfdc18, Gzma, Vps37b, Serpinb6b, Chn2, Errfi1, Aqp5 
##     Neurl3, Ppp1r3b, Cd7, Klre1, Nkg7, Hba-a1, Hbb-bs, Gzmc, Ctla2b, Lcn2 
## PC_ 5 
## Positive:  Krt15, Dsc3, Krt5, Krt14, Sparc, Fat1, Fkbp9, Vcan, Igsf10, Ppic 
##     Tagln, Slpi, Trp63, Cav1, Acta2, Postn, S100a16, Serpine2, A430105J06Rik, Igfbp3 
##     Hmcn1, Cxcl14, Phlda3, Sfrp1, Brinp3, Col4a1, Tpm2, Palld, Col6a1, Gas1 
## Negative:  Nme1, Nme2, Chchd10, Kcnk1, Rplp0, Rps3, Elf5, Atp2c2, 4931406C07Rik, Cldn7 
##     Cystm1, Kcnn4, Eef1b2, Ppp1r2, Wfdc18, Npm1, Prss8, Rpl12, 1110008P14Rik, Erh 
##     Nhp2, Ass1, Epcam, Rpsa, Smim22, Cd24a, Plet1, Rab25, Cldn3, Ehf
c14 <- RunHarmony(c14,
                        group.by.vars = 'library_id')
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony 5/10
## Harmony 6/10
## Harmony 7/10
## Harmony converged after 7 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.RNA.harmony; see ?make.names for more details
## on syntax validity
ElbowPlot(c14)

c14 <- RunUMAP(c14, reduction = 'harmony', dims = 1:5,
                  seed.use = 500)
## 09:45:05 UMAP embedding parameters a = 0.9922 b = 1.112
## 09:45:05 Read 340 rows and found 5 numeric columns
## 09:45:05 Using Annoy for neighbor search, n_neighbors = 30
## 09:45:05 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 09:45:05 Writing NN index file to temp file C:\Users\Nick\AppData\Local\Temp\RtmpwBA9Fz\file18307e5632a1
## 09:45:05 Searching Annoy index using 1 thread, search_k = 3000
## 09:45:05 Annoy recall = 100%
## 09:45:06 Commencing smooth kNN distance calibration using 1 thread
## 09:45:06 Initializing from normalized Laplacian + noise
## 09:45:06 Commencing optimization for 500 epochs, with 11592 positive edges
## 09:45:07 Optimization finished
DimPlot(c14, label = TRUE)+
  coord_equal()

FeaturePlot(c14,
            features = c('Epcam', 'Cd3e', 'Cd14', 'Pdgfra'))

Subcluster

c14 <- FindNeighbors(c14,
                           reduction = 'harmony',
                           dims = 1:5)
## Computing nearest neighbor graph
## Computing SNN
cluster_qc <- cluster_sweep(seurat_object = c14,
                            embedding = c14@reductions$harmony@cell.embeddings[,1:5],
                            resolution = seq(from = 0.1, to = 1.5, by = 0.1))

cluster_qc <- cluster_qc %>%
  gather(c(-res, -n_clusters), value = 'value', key = 'metric')

ggplot(cluster_qc, aes(x = res, y = value, color = metric))+
  geom_point(aes(size = n_clusters))+
  geom_line()+
  theme_bw()+
  facet_wrap(~metric, ncol = 1, scales = 'free_y')

c14 <- FindClusters(c14,
                          resolution = 0.1,
                          algorithm = 4)

DotPlot(c14,
        features = c('Cd14', 'Cd3e', 'Epcam', 'Pdgfra', 'Pecam1'),
        cols = 'RdYlBu')+
  coord_flip()+
  RotatedAxis()

DimPlot(c14, label = TRUE, label.size = 8)+
  coord_equal()+
  theme(legend.position = 'none')

FeaturePlot(c14, features = c('Cd3e', 'Cd14', 'Cd74', 'Epcam'))

DotPlot(c14,
        features = c('Ptprc', 'Cd74', 'Epcam', 'Cd14', 'Cd3e', 'Pdgfra', 'Krt5', 'Pecam1', 'Mki67', 'Pdgfrb'),
        cols = 'RdYlBu')+
  theme_bw()+
  coord_flip()

c14_subcluster_dict <- tibble(cell_barcode = rownames(c14@meta.data),
                              cluster_l2 = 14 + as.numeric(c14@meta.data$seurat_clusters)/10)

Rejoin subclustered with the rest of data

# Add c14 subcluster info back to main seurat object
so_merge@meta.data$cell_barcode <- rownames(so_merge@meta.data)

so_merge@meta.data <- full_join(x = so_merge@meta.data,
                                y = c14_subcluster_dict,
                                by = 'cell_barcode',
                                suffix = c('', ''))

rownames(so_merge@meta.data) <- so_merge@meta.data$cell_barcode

# Fill in cluster_l2 as seurat_clusters for each cluster without subclusters
so_merge@meta.data$cluster_l2[is.na(so_merge@meta.data$cluster_l2)] <- so_merge@meta.data$seurat_clusters[is.na(so_merge@meta.data$cluster_l2)]

Idents(so_merge) <- 'cluster_l2'

Assign lineage

Lineage is biological interpretation from major markers.

DotPlot(so_merge,
        features = c('Ptprc', 'Cd74', 'Epcam', 'Cd14', 'Cd3e', 'Pdgfra', 'Krt5', 'Pecam1', 'Mki67', 'Pdgfrb'),
        cols = 'RdYlBu')+
  theme_bw()+
  coord_flip()

epi_clusters <- c(2, 8, 10, 19, 14.3)
lymphoid_clusters <- c(5, 6, 9, 13, 14.1, 21, 22)
myeloid_clusters <- c(1, 3, 4, 14.2, 15, 16, 18, 20, 24)
fibroblast_clusters <- c(7, 12, 17)
endothelial_clusters <- c(11)
perivascular_clusters <- c(23)


lineage_dict <- tibble(lineage = c(rep('epithelial', length(epi_clusters)),
                                   rep('lymphoid', length(lymphoid_clusters)),
                                   rep('myeloid', length(myeloid_clusters)),
                                   rep('fibroblast', length(fibroblast_clusters)),
                                   rep('endothelial', length(endothelial_clusters)),
                                   rep('perivascular', length(perivascular_clusters))),
                      cluster_id = c(epi_clusters, lymphoid_clusters, myeloid_clusters, fibroblast_clusters,
                                     endothelial_clusters, perivascular_clusters))

so_merge[['lineage']] <- plyr::mapvalues(x = so_merge@meta.data$cluster_l2,
                                             from = lineage_dict$cluster_id,
                                             to = lineage_dict$lineage)

# Make sure cluster_l2 and lineage are factors for Seurat plotting compatibility
so_merge@meta.data$lineage <- as.factor(so_merge@meta.data$lineage)
so_merge@meta.data$cluster_l2 <- as.factor(so_merge@meta.data$cluster_l2)

DotPlot(so_merge,
        features = c( 'Pecam1', 'Epcam', 'Krt5', 'Pdgfra', 'Ptprc', 'Cd3e', 'Cd14', 'Cd74', 'Pdgfrb'),
        cols = 'RdYlBu',
        group.by = 'lineage')+
  coord_flip()+
  RotatedAxis()

Cluster relationship investigation

Lineage vs nmf component score

# Average iNMF embeddings to show relationship to lineage
cluster_embedding <- tibble(lineage = so_merge@meta.data$lineage) %>%
  cbind(so_merge@reductions$iNMF@cell.embeddings) %>%
  group_by(lineage) %>%
  summarize(across(everything(), list(mean)))

pheatmap::pheatmap(data.frame(cluster_embedding[,2:ncol(cluster_embedding)],
                              row.names = cluster_embedding$lineage),
                   scale = 'column',
                   main = 'Average iNMF embedding per cluster \n column z-scored')

# Average iNMF embedding to show relationship to cluster, annotated by lineage
cluster_embedding <- tibble(cluster_l2 = so_merge@meta.data$cluster_l2) %>%
  cbind(so_merge@reductions$iNMF@cell.embeddings) %>%
  group_by(cluster_l2) %>%
  summarize(across(everything(), list(mean)))

annotation_rows <- data.frame(lineage = lineage_dict$lineage,
                              row.names = lineage_dict$cluster_id)

pheatmap::pheatmap(data.frame(cluster_embedding[,2:ncol(cluster_embedding)],
                              row.names = cluster_embedding$cluster_l2),
                   scale = 'row',
                   main = 'Average iNMF embedding per cluster \n row z-scored',
                   annotation_row = annotation_rows,
                   cutree_rows = 7)

pheatmap::pheatmap(data.frame(cluster_embedding[,2:ncol(cluster_embedding)],
                              row.names = cluster_embedding$cluster_l2),
                   scale = 'column',
                   main = 'Average iNMF embedding per cluster \n column z-scored',
                   annotation_row = annotation_rows,
                   cutree_rows = 6)

DimPlot(so_merge,
        label = TRUE,
        label.size = 6,
        group.by = 'lineage')+
  coord_equal()

Correlation between clusters

code adapted from: https://github.com/satijalab/seurat/issues/1552

Idents(so_merge) <- 'cluster_l2'

av.exp <- AverageExpression(so_merge)$RNA[VariableFeatures(so_merge),]
cor.exp <- as.data.frame(cor(av.exp, method = 'pearson'))

pheatmap::pheatmap(cor.exp,
                   annotation_row = annotation_rows)

DimPlot(so_merge,
        label = TRUE)+
  coord_equal()

Celltype fractions

Visualize celltype fractions per tumor

so_merge_meta <- so_merge@meta.data %>%
  mutate(tumor_pheno = paste0(tumor, '-', histology))

freq_table <- so_merge_meta %>%
  dplyr::select(lineage, tumor_pheno) %>%
  table()

col_anno <- data.frame(histology = str_split(colnames(freq_table), pattern = '-', simplify = TRUE)[,2],
                       log10_n_cells = log10(colSums(freq_table)),
                       row.names = colnames(freq_table))

pheatmap::pheatmap(prop.table(freq_table, margin = 2), 
                   display_numbers = freq_table,
                   main = 'celltype fraction \n Number = n_cells per celltype per tumor',
                   annotation_col = col_anno)

Assign celltype_frac tumor type

  • SP = stromal poor
  • SR_IP = stromal-rich immune poor
  • SR_IR = stromal-rich immune rich
sp <- c('V8_T1', 'V3_T2', 'V4', 'V3_T1')
srip <- c('V6_T1', 'V6_T3')
srir <- c('V7_T3', 'V7_T1', 'V7_T2', 'V5', 'V6_T2')

cellfrac_assignments <- data.frame(rbind(cbind('SP', sp),
                                   cbind('SR_IP', srip),
                                   cbind('SR_IR', srir)))

colnames(cellfrac_assignments) <- c('cellfrac_type', 'tumor')

so_merge@meta.data$cellfrac_type <- plyr::mapvalues(x = so_merge@meta.data$tumor,
                                              from = cellfrac_assignments$tumor,
                                               to = cellfrac_assignments$cellfrac_type)

# Visualize results
table(so_merge@meta.data$cellfrac_type) %>%
  knitr::kable()
Var1 Freq
SP 3346
SR_IP 1278
SR_IR 9418
DimPlot(so_merge,
        group.by = 'cellfrac_type')+
  facet_wrap(~cellfrac_type)

so_meta <- so_merge@meta.data

ggplot(so_meta, aes(x = cluster_l2, fill = cellfrac_type))+
  geom_bar()+
  theme_bw()+
  facet_grid(cellfrac_type~lineage, scales = 'free', space = 'free_x')+
  RotatedAxis()

3d umap

so_merge <- RunUMAP(so_merge,
                    reduction = 'iNMF',
                    dims = 1:50,
                    n.components = 3L,
                    reduction.name = 'umap3d',
                    reduction.key = 'umap3d_')
## 09:45:27 UMAP embedding parameters a = 0.9922 b = 1.112
## 09:45:27 Read 14042 rows and found 50 numeric columns
## 09:45:27 Using Annoy for neighbor search, n_neighbors = 30
## 09:45:27 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 09:45:30 Writing NN index file to temp file C:\Users\Nick\AppData\Local\Temp\RtmpwBA9Fz\file1830501a1548
## 09:45:30 Searching Annoy index using 1 thread, search_k = 3000
## 09:45:33 Annoy recall = 100%
## 09:45:33 Commencing smooth kNN distance calibration using 1 thread
## 09:45:34 Initializing from normalized Laplacian + noise
## 09:45:35 Commencing optimization for 200 epochs, with 635162 positive edges
## 09:45:47 Optimization finished
so_meta <- so_merge@meta.data %>%
  mutate(umap3d_1 = so_merge@reductions$umap3d@cell.embeddings[,1]) %>%
  mutate(umap3d_2 = so_merge@reductions$umap3d@cell.embeddings[,2]) %>%
  mutate(umap3d_3 = so_merge@reductions$umap3d@cell.embeddings[,3])

plotly::plot_ly(so_meta,
                x = ~umap3d_1,
                y = ~umap3d_2,
                z = ~umap3d_3,
                color = ~seurat_clusters)
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
plotly::plot_ly(so_meta,
                x = ~umap3d_1,
                y = ~umap3d_2,
                z = ~umap3d_3,
                color = ~lineage)
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Find DEG & enriched ontologies for each cluster overall

load gene enrichment libraries

library(clusterProfiler)
## 
## clusterProfiler v4.0.5  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141. doi: 10.1016/j.xinn.2021.100141
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:purrr':
## 
##     simplify
## The following object is masked from 'package:stats':
## 
##     filter
library(org.Mm.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.1.2
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:clusterProfiler':
## 
##     rename
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following objects are masked from 'package:Matrix':
## 
##     expand, unname
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:clusterProfiler':
## 
##     slice
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## The following object is masked from 'package:grDevices':
## 
##     windows
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:clusterProfiler':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
## 

Find DEG

Idents(so_merge) <- 'cluster_l2'
markers <- FindAllMarkers(so_merge)
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14.1
## Calculating cluster 14.2
## Calculating cluster 14.3
## Calculating cluster 15
## Calculating cluster 16
## Calculating cluster 17
## Calculating cluster 18
## Calculating cluster 19
## Calculating cluster 20
## Calculating cluster 21
## Calculating cluster 22
## Calculating cluster 23
## Calculating cluster 24
top_markers <- markers %>%
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = 5)

# Scale data for all genes for heatmap purposes
so_merge <- ScaleData(so_merge, 
                      features = rownames(so_merge))
## Centering and scaling data matrix
DoHeatmap(so_merge, features = top_markers$gene)

write_csv(markers, 'analysis_files/s3_cluster_l2_markers.csv')

Save DEG for each lineage that has more than one cluster

for(i in unique(so_merge@meta.data$lineage)){
  
  lineage_clusters <- so_merge@meta.data %>%
    filter(lineage == i) %>%
    pull(cluster_l2) %>%
    unique() %>%
    droplevels()
  
  if(length(lineage_clusters) > 1){
    
    lineage_markers <-markers %>%
      filter(cluster %in% lineage_clusters)
    
    write_csv(lineage_markers, paste0('analysis_files/s3_', i, '_markers.csv'))
    
  }
  
}

save rds output

saveRDS(so_merge, file = 'analysis_files/s3_celltypes.rds')

sessionInfo()

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] org.Mm.eg.db_3.13.0   AnnotationDbi_1.56.2  IRanges_2.28.0       
##  [4] S4Vectors_0.32.3      Biobase_2.54.0        BiocGenerics_0.40.0  
##  [7] clusterProfiler_4.0.5 harmony_0.1.0         Rcpp_1.0.7           
## [10] bluster_1.2.1         cluster_2.1.2         SeuratObject_4.0.4   
## [13] Seurat_4.1.0          forcats_0.5.1         stringr_1.4.0        
## [16] dplyr_1.0.8           purrr_0.3.4           readr_2.1.2          
## [19] tidyr_1.2.0           tibble_3.1.6          ggplot2_3.3.5        
## [22] tidyverse_1.3.1       Matrix_1.3-4         
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2             reticulate_1.24        tidyselect_1.1.2      
##   [4] RSQLite_2.2.9          htmlwidgets_1.5.4      grid_4.1.1            
##   [7] BiocParallel_1.28.3    Rtsne_0.15             scatterpie_0.1.7      
##  [10] munsell_0.5.0          codetools_0.2-18       ica_1.0-2             
##  [13] future_1.24.0          miniUI_0.1.1.1         withr_2.5.0           
##  [16] spatstat.random_2.1-0  colorspace_2.0-3       GOSemSim_2.18.1       
##  [19] highr_0.9              knitr_1.37             rstudioapi_0.13       
##  [22] ROCR_1.0-11            tensor_1.5             DOSE_3.18.3           
##  [25] listenv_0.8.0          labeling_0.4.2         GenomeInfoDbData_1.2.7
##  [28] polyclip_1.10-0        bit64_4.0.5            farver_2.1.0          
##  [31] pheatmap_1.0.12        downloader_0.4         treeio_1.16.2         
##  [34] parallelly_1.30.0      vctrs_0.3.8            generics_0.1.2        
##  [37] xfun_0.29              R6_2.5.1               GenomeInfoDb_1.30.1   
##  [40] graphlayouts_0.7.1     rmdformats_1.0.3       gridGraphics_0.5-1    
##  [43] fgsea_1.18.0           bitops_1.0-7           spatstat.utils_2.3-0  
##  [46] cachem_1.0.6           assertthat_0.2.1       vroom_1.5.7           
##  [49] promises_1.2.0.1       scales_1.1.1           ggraph_2.0.5          
##  [52] enrichplot_1.12.3      gtable_0.3.0           globals_0.14.0        
##  [55] goftest_1.2-3          tidygraph_1.2.0        rlang_1.0.1           
##  [58] splines_4.1.1          lazyeval_0.2.2         spatstat.geom_2.3-2   
##  [61] broom_0.7.12           yaml_2.3.5             reshape2_1.4.4        
##  [64] abind_1.4-5            modelr_0.1.8           crosstalk_1.2.0       
##  [67] backports_1.3.0        httpuv_1.6.5           qvalue_2.24.0         
##  [70] tools_4.1.1            bookdown_0.24          ggplotify_0.1.0       
##  [73] ellipsis_0.3.2         spatstat.core_2.4-0    jquerylib_0.1.4       
##  [76] RColorBrewer_1.1-2     ggridges_0.5.3         plyr_1.8.6            
##  [79] zlibbioc_1.40.0        RCurl_1.98-1.5         rpart_4.1-15          
##  [82] deldir_1.0-6           viridis_0.6.2          pbapply_1.5-0         
##  [85] cowplot_1.1.1          zoo_1.8-9              haven_2.4.3           
##  [88] ggrepel_0.9.1          fs_1.5.2               magrittr_2.0.1        
##  [91] data.table_1.14.2      RSpectra_0.16-0        scattermore_0.8       
##  [94] DO.db_2.9              lmtest_0.9-39          reprex_2.0.1          
##  [97] RANN_2.6.1             fitdistrplus_1.1-6     matrixStats_0.61.0    
## [100] hms_1.1.1              patchwork_1.1.1        mime_0.12             
## [103] evaluate_0.15          xtable_1.8-4           readxl_1.3.1          
## [106] gridExtra_2.3          compiler_4.1.1         shadowtext_0.1.1      
## [109] KernSmooth_2.23-20     crayon_1.5.0           htmltools_0.5.2       
## [112] ggfun_0.0.5            mgcv_1.8-36            later_1.3.0           
## [115] tzdb_0.2.0             aplot_0.1.2            lubridate_1.8.0       
## [118] DBI_1.1.2              tweenr_1.0.2           dbplyr_2.1.1          
## [121] MASS_7.3-54            cli_3.1.1              parallel_4.1.1        
## [124] igraph_1.2.7           pkgconfig_2.0.3        plotly_4.10.0         
## [127] spatstat.sparse_2.1-0  xml2_1.3.3             ggtree_3.0.4          
## [130] bslib_0.3.1            XVector_0.34.0         rvest_1.0.2           
## [133] yulab.utils_0.0.4      digest_0.6.27          sctransform_0.3.3     
## [136] RcppAnnoy_0.0.19       spatstat.data_2.1-2    Biostrings_2.62.0     
## [139] fastmatch_1.1-3        rmarkdown_2.11         cellranger_1.1.0      
## [142] leiden_0.3.9           tidytree_0.3.8         uwot_0.1.11           
## [145] shiny_1.7.1            lifecycle_1.0.1        nlme_3.1-152          
## [148] jsonlite_1.7.2         BiocNeighbors_1.10.0   limma_3.48.3          
## [151] viridisLite_0.4.0      fansi_1.0.2            pillar_1.7.0          
## [154] lattice_0.20-44        KEGGREST_1.34.0        fastmap_1.1.0         
## [157] httr_1.4.2             survival_3.2-11        GO.db_3.13.0          
## [160] glue_1.6.1             png_0.1-7              bit_4.0.4             
## [163] ggforce_0.3.3          stringi_1.7.6          sass_0.4.0            
## [166] blob_1.2.2             memoise_2.0.1          ape_5.5               
## [169] irlba_2.3.5            future.apply_1.8.1